Quantile normalization
Group data by normalization units (comb_id)
library(cmapR)
library(dplyr)
source('./functions/Functions_general.R')
source('./functions/Functions_mod_z_score.R')
end_char = substr(outdir, nchar(outdir), nchar(outdir))
if(end_char != "/"){
outdir = paste0(outdir, '/')
}
if(!dir.exists(outdir)){
print("Creating output directory")
dir.create(outdir)
}
getwd()
[1] "/Users/sophie_chen/Downloads/psa_rnaseq_manuscript_rproject-main"
#VST data calculated in 01_qc_and_vst.Rmd script
data_vst = parse_gctx(vst_filepath)
parsing as GCT v1.3
/Users/sophie_chen/downloads/psa_rnaseq_manuscript_rproject-main/output/mocp_0193_vst_remove_low_count_samples_and_genes_n120x5849.gct 5849 rows, 120 cols, 16 row descriptors, 23 col descriptors
data_mat = data_vst@mat
row_meta = data_vst@rdesc
col_meta = data_vst@cdesc
#Compound metadata
# comp_meta = read.csv('./reference_files/Metadata_compound_231129.csv', stringsAsFactors = F)
Quantile normalization within each normalization unit (normalization
across each sample)
#Define groups (batches) to do quantile normalization
temp = col_meta[[batch_column_name]]
col_meta[["comb_id"]] = temp
remove(temp)
comb_id_unique = unique(col_meta$comb_id)
print(comb_id_unique)
[1] "MOCP_0193_1"
#update metadata in gctx file
data_vst@cdesc = col_meta
Plotting qnorm data for QC visualization
#Loop over each comb_id
data_qnorm = data_mat
flag0 = rep(0, length(comb_id_unique)) #flag to check if all comb_ids ran
for(num in 1:length(comb_id_unique)){
idx = which(col_meta$comb_id == comb_id_unique[num])
sum(length(idx))
if(sum(length(idx))>0){
data_qnorm[,idx] = qnorm_median(data_mat[,idx])
}else{
flag0[num] = 1
}
}
#If any comb_ids had no samples, sum(flag0) > 0
if(sum(flag0) > 0){
print("Warning: Not all comb_ids were quantile normalized")
}else{
print("All comb_ids quantile normalized")
}
[1] "All comb_ids quantile normalized"
any(is.na(data_qnorm))
[1] FALSE
#Make boxplot of each sample
col_meta_select = dplyr::select(col_meta, id, comb_id) %>% distinct()
data_qnorm_df = as.data.frame(data_qnorm)
data_qnorm_df$gene_id = rownames(data_qnorm_df)
data_qnorm_df = tidyr::gather(data_qnorm_df, key = "id", value = "qnorm_vst", -gene_id)
data_qnorm_df = dplyr::left_join(data_qnorm_df, col_meta_select, by = "id")
ggplot(data_qnorm_df, aes(x = id, y = qnorm_vst, group = id)) +
geom_boxplot()+
facet_wrap(~comb_id, scales = "free_x")+
theme_bw(base_size = 14)

data_qnorm_pca = run_pca(data_qnorm, col_meta = col_meta, color_col = "strain_id", scale0 = T)



Z-score across each gene
Prepare with QC
#Save qnorm data to gct
qnorm_file_path = paste0(outdir, paste(run_name,run_name_suffix, "qnorm_remove_low_count_samples_and_genes", sep = "_"))
save_to_gct(data_mat = data_qnorm, data_cdesc = col_meta, data_rdesc = row_meta, savefilepath = qnorm_file_path)
Saving file to /Users/sophie_chen/downloads/psa_rnaseq_manuscript_rproject-main/output/mocp_0193__qnorm_remove_low_count_samples_and_genes_n120x5849.gct
Dimensions of matrix: [5849x120]
Setting precision to 4
Saved.
Clip extreme z-score values
zscore_list = list()
data_zscore = data_qnorm
flag0 = rep(0, length(comb_id_unique)) #flag to check if all comb_ids ran
for(num in 1:length(comb_id_unique)){
idx = which(col_meta$comb_id == comb_id_unique[num])
sum(length(idx))
if(sum(length(idx))>0){
zscore_list[[num]] = robust_zscore(data_qnorm[,idx], dim0 = 1) #z-score along rows, outputs zscores, medians and mads
data_zscore[,idx] = zscore_list[[num]][[1]] #only taking the z-scores
}else{
flag0[num] = 1
}
}
#Any comb_ids didn't get z-scored?
if(sum(flag0) > 0){
print("Warning: Not all comb_ids were standardized")
}
Plot PCA
#Clip z-score values to account for extreme cases where genes with low count had low variation (i.e. low median absolute deviation)
#Look at genes' z-score distribution
hist(data_zscore, breaks = 100)

quantile(data_zscore, c(0.001, 0.005, 0.01, 0.05, 0.95, 0.99, 0.995, 0.999))
0.1% 0.5% 1% 5% 95% 99% 99.5% 99.9%
-5.838539 -3.880762 -3.201871 -1.892837 1.819762 3.351557 4.266416 7.193971
data_zscore = base::pmax(data_zscore, -1*clip_value)
data_zscore = base::pmin(data_zscore, clip_value)
Save z-scores into GCT file
data_zscore_pca = run_pca(data_zscore, col_meta = col_meta, color_col = "strain_id", scale0 = T)



Collapse replicates
if(!all(colnames(data_zscore) == col_meta$id)){
print("Warning: column names of matrix & metadata IDs do not match")
}
if(!all(rownames(data_zscore) == row_meta$gene_id)){
print("Warning: row names do not match gene IDs in metadata")
}
save_to_gct(data_mat = data_zscore, data_cdesc = col_meta, data_rdesc = row_meta, savefilepath = paste0(outdir, paste(run_name,run_name_suffix, "zscore_remove_low_count_samples_and_genes", sep = "_")))
Saving file to /Users/sophie_chen/downloads/psa_rnaseq_manuscript_rproject-main/output/mocp_0193__zscore_remove_low_count_samples_and_genes_n120x5849.gct
Dimensions of matrix: [5849x120]
Setting precision to 4
Saved.
num_row_zscore = dim(data_zscore)[1]
num_col_zscore = dim(data_zscore)[2]
Save replicate collapsed data as gct file
zscore = cmapR::parse_gctx(paste0(outdir, paste(run_name,run_name_suffix, "zscore_remove_low_count_samples_and_genes", sep = "_"), "_n", num_col_zscore, "x", num_row_zscore, ".gct"))
parsing as GCT v1.3
/Users/sophie_chen/downloads/psa_rnaseq_manuscript_rproject-main/output/mocp_0193__zscore_remove_low_count_samples_and_genes_n120x5849.gct 5849 rows, 120 cols, 16 row descriptors, 24 col descriptors
col_meta = zscore@cdesc
data_zscore = zscore@mat
#identify replicates
#perform moderated collapsing of averages
condition_id_unique = unique(col_meta$condition_id)
num_genes = dim(data_zscore)[1]
num_samples = dim(data_zscore)[2]
num_conditions = length(condition_id_unique)
data_rep_collapse = matrix(NA, nrow = num_genes, ncol = num_conditions)
all(col_meta$id == colnames(data_zscore))
[1] TRUE
for(num in 1:num_conditions){
idx = which(col_meta$condition_id == condition_id_unique[num])
cidx = which(colnames(data_zscore) %in% col_meta$id[idx])
data_rep_collapse[,num] = modzs_collapse_avg(data = data_zscore[,cidx], ridx = seq(1, num_genes, by = 1))
}
colnames(data_rep_collapse) = condition_id_unique
rownames(data_rep_collapse) = rownames(data_zscore)
Plots and QC
#Revamp column metadata since replicates are collapsed
col_meta_collapse = select(sample_meta, condition_id, pert_id, pert_dose, pert_idose, pert_dose_unit, pert_iname, pert_itime, pert_time, pert_time_unit, pert_type, project_id, strain_id, x_dose_mic_high_inoculum) %>% distinct()
col_meta_collapse_sub = data.frame(condition_id = condition_id_unique)
col_meta_collapse_sub = left_join(col_meta_collapse_sub, col_meta_collapse, by = "condition_id")
all(colnames(data_rep_collapse) == col_meta_collapse_sub$condition_id)
[1] TRUE
all(rownames(data_rep_collapse) == row_meta$gene_id)
[1] TRUE
save_to_gct(data_mat = data_rep_collapse, data_cdesc = col_meta_collapse_sub, data_rdesc = row_meta, savefilepath = paste0(outdir, paste(run_name,run_name_suffix, "modzscore_remove_low_count_samples_and_genes_rep_collapse", sep = "_")))
Saving file to /Users/sophie_chen/downloads/psa_rnaseq_manuscript_rproject-main/output/mocp_0193__modzscore_remove_low_count_samples_and_genes_rep_collapse_n40x5849.gct
Dimensions of matrix: [5849x40]
Setting precision to 4
Saved.
num_row_collapse = dim(data_rep_collapse)[1]
num_col_collapse = dim(data_rep_collapse)[2]